#knitr::opts_chunk$set(echo = FALSE, error=TRUE, message=FALSE, warning=FALSE)
library(tidyverse) # for data cleaning and plotting
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate) # for date manipulation
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(openintro) # for the abbr2state() function
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(maps) # for map data
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(ggmap) # for mapping points on maps
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(gplots) # for col2hex() function
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(RColorBrewer) # for color palettes
library(sf) # for working with spatial data
## Linking to GEOS 3.9.1, GDAL 3.4.0, PROJ 8.1.1; sf_use_s2() is TRUE
library(leaflet) # for highly customizable mapping
library(ggthemes)
library(plotly) # for the ggplotly() - basic interactivity
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
##
## wind
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gganimate) # for adding animation layers to ggplots
library(gifski) # for creating the gif (don't need to load this library every time,but need it installed)
library(transformr) # for "tweening" (gganimate)
##
## Attaching package: 'transformr'
## The following object is masked from 'package:sf':
##
## st_normalize
library(shiny) # for creating interactive apps
library(patchwork) # for nicely combining ggplot2 graphs
library(gt) # for creating nice tables
##
## Attaching package: 'gt'
## The following object is masked from 'package:openintro':
##
## sp500
library(countrycode)
theme_set(theme_minimal())
library(tidytext)
Every four years (or two), the world holds its breath as individual humans represent whole countries through their mastery of a sport. We data scientists have analyzed the Summer Olympics from the year 1896 to 2016–yes, 120 years of the top athletes competing for the glory, and, perhaps more importantly, the gold.
This data hails from Kaggle from the games in Athens 1896 to Rio 2016; more recently, it was popularized through the weekly R challenge “Tidy Tuesday.” Until 1992, the Winter and Summer Games coincided in the same year, which results in odd data points like a Figure Skating victory slipping into the Summer Olympics data sets. Other aspects of the history of the Olympics include the existence and end to whole nations as the years progress. Examples include East and West Germany and the Soviet Union–as well as unrecognized independent territories such as Kosovo and Taiwan. Our team’s methodology to eliminate our own biases was to keep the data points that reflect 2016’s Olympic Games for sports measured and the countries recognized are universally agreed upon by the international community.
olympics <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-07-27/olympics.csv')
## Rows: 271116 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): name, sex, team, noc, games, season, city, sport, event, medal
## dbl (5): id, age, height, weight, year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
world <- map_data("world")
medal_colors <- c(
Gold = "#d5a500",
Silver = "#b7b7b7",
Bronze = "#a17419")
Filtered Data Sets
summer_olympics <- olympics %>%
filter(season == "Summer") %>%
mutate(countryName = countrycode(noc, "genc3c", "country.name")) %>%
mutate(countryName = ifelse(countryName == "United States", "USA", countryName)) %>%
mutate(countryName = ifelse(noc == "ALG", "Algeria", countryName)) %>%
mutate(countryName = ifelse(noc == "BAH", "Bahamas", countryName)) %>%
mutate(countryName = ifelse(noc == "BUL", "Bulgaria", countryName)) %>%
mutate(countryName = ifelse(noc == "CHI", "Chile", countryName)) %>%
mutate(countryName = ifelse(noc == "CRC", "Costa Rica", countryName)) %>%
mutate(countryName = ifelse(noc == "CRO", "Croatia", countryName)) %>%
mutate(countryName = ifelse(noc == "CZA", "Czech Republic", countryName)) %>%
mutate(countryName = ifelse(noc == "DEN", "Denmark", countryName)) %>%
mutate(countryName = ifelse(noc == "FIJ", "Fiji", countryName)) %>%
mutate(countryName = ifelse(noc == "GER", "Germany", countryName)) %>%
mutate(countryName = ifelse(noc == "GRE", "Greece", countryName)) %>%
mutate(countryName = ifelse(noc == "GRN", "Grenada", countryName)) %>%
mutate(countryName = ifelse(noc == "HAI", "Haiti", countryName)) %>%
mutate(countryName = ifelse(noc == "INA", "Indonesia", countryName)) %>%
mutate(countryName = ifelse(noc == "IRI", "Iran", countryName)) %>%
mutate(countryName = ifelse(noc == "LAT", "Latvia", countryName)) %>%
mutate(countryName = ifelse(noc == "MGL", "Mongolia", countryName)) %>%
mutate(countryName = ifelse(noc == "NED", "Netherlands", countryName)) %>%
mutate(countryName = ifelse(noc == "NGR", "Nigeria", countryName)) %>%
mutate(countryName = ifelse(noc == "POR", "Portugal", countryName)) %>%
mutate(countryName = ifelse(noc == "PUR", "Puerto Rico", countryName)) %>%
mutate(countryName = ifelse(noc == "RSA", "South Africa", countryName)) %>%
mutate(countryName = ifelse(noc == "SLO", "Slovenia", countryName)) %>%
mutate(countryName = ifelse(noc == "SUI", "Switzerland", countryName)) %>%
mutate(countryName = ifelse(noc == "UAE", "United Arab Emirates", countryName)) %>%
mutate(countryName = ifelse(noc == "URU", "Uruguay", countryName)) %>%
mutate(countryName = ifelse(noc == "VIE", "Vietnam", countryName)) %>%
mutate(countryName = ifelse(noc == "ZIM", "Zimbabwe", countryName))
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: AHO, ALG, ANG, ANT, ANZ, ARU, ASA, BAH, BAN, BAR, BER, BHU, BIZ, BOH, BOT, BRU, BUL, BUR, CAM, CAY, CGO, CHA, CHI, CRC, CRO, CRT, DEN, ESA, EUN, FIJ, FRG, GAM, GBS, GDR, GEQ, GER, GRE, GRN, GUA, GUI, HAI, HON, INA, IOA, IRI, ISV, IVB, KOS, KSA, KUW, LAT, LBA, LES, LIB, MAD, MAL, MAS, MAW, MGL, MON, MRI, MTN, MYA, NBO, NCA, NED, NEP, NFL, NGR, NIG, OMA, PAR, PHI, PLE, POR, PUR, RHO, ROT, RSA, SAA, SAM, SCG, SEY, SKN, SLO, SOL, SRI, SUD, SUI, TAN, TCH, TGA, TOG, TPE, UAE, UAR, UNK, URS, URU, VAN, VIE, VIN, WIF, YAR, YMD, YUG, ZAM, ZIM
# Footnote or mention in part of the project the choice to leave out territories not fully recognized, such as Kosovo and Chinese Taipei, and countries that no longer exist, including the Soviet Union and Yugoslavia.
summer_gold <- summer_olympics %>%
drop_na(medal) %>%
filter(medal == "Gold")
summer_medals <- summer_olympics %>%
drop_na(medal)
medal_prop <- summer_olympics %>%
count(countryName, medal) %>%
group_by(countryName) %>%
filter(any(!is.na(medal))) %>%
mutate(medal = ifelse(is.na(medal), "No Medal", medal)) %>%
pivot_wider(
names_from = medal,
values_from = n,
values_fill = 0) %>%
mutate(
total_medals = Gold + Silver + Bronze,
total_events = total_medals + `No Medal`,
p_medal = total_medals / total_events) %>%
arrange(desc(total_medals))
Female-only, medal-winning data:
Instructions that might be useful to remember:
NEW!! With animated graphs, add eval=FALSE to the code chunk that creates the animation and saves it using anim_save(). Add another code chunk to reread the gif back into the file. See the tutorial for help.
When you are finished with ALL the exercises, uncomment the options at the top so your document looks nicer. Don’t do it before then, or else you might miss some important warnings and messages.
In general, the net amount of medals won is favored by sports within which have multiple opportunities to medal. One example includes Athletics, also known as “Track and Field,” having multiple races, team relays, and field events for athletes to earn medals in. Other result by contrasting by gender shows both the lack of female inclusion for a significant period of time, reflecting gender dynamics over the decades, as well as sports that are more dominated by one gender. Synchronized Swimming, Rhythmic Gymnastics appear to only have female champions, while Wrestling, the Modern Pentathlon, Rugby, and Lacrosse appear to have majority male winners.
## # A tibble: 8,665 × 16
## id name sex age height weight team noc games year season city
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 37 Ann Kr… F 23 182 64 Norway NOR 1996… 1996 Summer Atla…
## 2 65 Patima… F 21 165 49 Azerb… AZE 2016… 2016 Summer Rio …
## 3 67 Mariya… F 22 179 80 Russia RUS 2008… 2008 Summer Beij…
## 4 90 Tamila… F 21 163 60 Russia RUS 2004… 2004 Summer Athi…
## 5 153 Monica… F 23 191 88 Unite… USA 2008… 2008 Summer Beij…
## 6 165 Nia Ni… F 20 175 56 Unite… USA 2004… 2004 Summer Athi…
## 7 259 Reema … F 21 173 59 Canada CAN 1984… 1984 Summer Los …
## 8 394 Irene … F 19 160 48 East … GDR 1972… 1972 Summer Muni…
## 9 395 Jennif… F 20 160 62 Canada CAN 2012… 2012 Summer Lond…
## 10 428 Elvan … F 25 159 40 Turkey TUR 2008… 2008 Summer Beij…
## # … with 8,655 more rows, and 4 more variables: sport <chr>, event <chr>,
## # medal <chr>, countryName <chr>
Above is a graph that shows all the female athletes in one graph, showing a wide range of heights and weights of Olympians. However, due to the amount of sports, the graph itself is very noisy. To better understand the trends, the same data is compiled into four different categories.
The first category is “Holding an object”: These athletes are holding onto something, whether it is the reigns of a horse or a racket, they are armed.
The second is “Wearing protective gear.” The athletes in this category are wearing an extra layer designed for their sport to reduce injury.
The third is “In water.” As it sounds, these athletes are found in the water to compete.
Lastly, there is “Just self.” These athletes are not holding anything, wearing something meant to protect them, or are in the water.
## Warning in sport == c("Badminton", "Cycling", "Equestrianism", "Golf",
## "Hockey", : longer object length is not a multiple of shorter object length
## # A tibble: 133 × 16
## id name sex age height weight team noc games year season city
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 1169 "Maril… F 33 172 58 Nethe… NED 2012… 2012 Summer Lond…
## 2 1546 "Eirin… F 17 167 45 Greece GRE 2000… 2000 Summer Sydn…
## 3 4418 "Alyso… F 27 162 65 Austr… AUS 2000… 2000 Summer Sydn…
## 4 6624 "Diana… F 33 175 85 Italy ITA 2016… 2016 Summer Rio …
## 5 11448 "Fiona… F 40 173 75 Great… GBR 2016… 2016 Summer Rio …
## 6 12113 "Elisa… F 16 168 49 Italy ITA 2004… 2004 Summer Athi…
## 7 14578 "Jeane… F 26 169 60 Great… GBR 2000… 2000 Summer Sydn…
## 8 15540 "Joann… F 28 174 72 Austr… AUS 2000… 2000 Summer Sydn…
## 9 17479 "Els C… F 30 177 58 Belgi… BEL 2000… 2000 Summer Sydn…
## 10 22054 "Mario… F 32 170 60 France FRA 1996… 1996 Summer Atla…
## # … with 123 more rows, and 4 more variables: sport <chr>, event <chr>,
## # medal <chr>, countryName <chr>
## Warning in sport == c("Boxing", "Judo", "Fencing", "Football", "Taekwondo", :
## longer object length is not a multiple of shorter object length
## # A tibble: 152 × 16
## id name sex age height weight team noc games year season city
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 1191 "Judit… F 27 164 62 Hunga… HUN 1964… 1964 Summer Tokyo
## 2 1730 "Monic… F 26 183 74 Unite… USA 2016… 2016 Summer Rio …
## 3 2769 "Kimia… F 18 185 57 Iran IRI 2016… 2016 Summer Rio …
## 4 3593 "An Ku… F 28 160 52 North… PRK 2008… 2008 Summer Beij…
## 5 4101 "Andri… F 26 175 65 Brazil BRA 2004… 2004 Summer Athi…
## 6 5631 "Kosov… F 27 166 56 Sweden SWE 2016… 2016 Summer Rio …
## 7 7597 "Bao Y… F 24 172 67 China CHN 2008… 2008 Summer Beij…
## 8 7717 "Brbar… F 20 171 71 Brazil BRA 2008… 2008 Summer Beij…
## 9 7920 "Valri… F 27 165 53 France FRA 1996… 1996 Summer Atla…
## 10 8364 "Saski… F 33 170 60 Germa… GER 2016… 2016 Summer Rio …
## # … with 142 more rows, and 4 more variables: sport <chr>, event <chr>,
## # medal <chr>, countryName <chr>
## Warning in sport == c("Canoeing", "Diving", "Water Polo", "Rowing",
## "Swimming", : longer object length is not a multiple of shorter object length
## # A tibble: 406 × 16
## id name sex age height weight team noc games year season city
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 395 "Jenni… F 20 160 62 Canada CAN 2012… 2012 Summer Lond…
## 2 980 "Rebec… F 23 179 70 Great… GBR 2012… 2012 Summer Lond…
## 3 1380 "Chris… F 22 172 52 Unite… USA 1992… 1992 Summer Barc…
## 4 2443 "Joan … F 17 170 59 Unite… USA 1952… 1952 Summer Hels…
## 5 2464 "Jo Qe… F 30 171 58 New Z… NZL 2016… 2016 Summer Rio …
## 6 2575 "Lisa … F 27 172 58 Canada CAN 1996… 1996 Summer Atla…
## 7 3070 "Malin… F 23 180 64 Sweden SWE 2000… 2000 Summer Sydn…
## 8 3880 "Agnet… F 31 172 68 Sweden SWE 1992… 1992 Summer Barc…
## 9 4393 "Herta… F 26 183 82 Roman… ROU 1988… 1988 Summer Seoul
## 10 4720 "Tessa… F 27 177 71 Nethe… NED 2000… 2000 Summer Sydn…
## # … with 396 more rows, and 4 more variables: sport <chr>, event <chr>,
## # medal <chr>, countryName <chr>
## Warning in sport == c("Athletics", "Basketball", "Beach Volleyball",
## "Gymnastics", : longer object length is not a multiple of shorter object length
## # A tibble: 353 × 16
## id name sex age height weight team noc games year season city
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 428 Elvan A… F 25 159 40 Turk… TUR 2008… 2008 Summer Beij…
## 2 610 Ginko A… F 26 148 46 Japan JPN 1964… 1964 Summer Tokyo
## 3 1020 Adriana… F 29 180 61 Braz… BRA 2000… 2000 Summer Sydn…
## 4 1071 Kseniya… F 20 158 48 Russ… RUS 2012… 2012 Summer Lond…
## 5 1254 Macaren… F 27 170 67 Spain ESP 2012… 2012 Summer Lond…
## 6 1313 Robyn M… F 32 172 67 Unit… USA 2008… 2008 Summer Beij…
## 7 1564 Chioma … F 24 164 57 Nige… NGR 1996… 1996 Summer Atla…
## 8 1732 Liudmyl… F 29 165 58 Sovi… URS 1976… 1976 Summer Mont…
## 9 2546 Alessan… F 22 200 85 Braz… BRA 1996… 1996 Summer Atla…
## 10 3281 Simona … F 16 158 44 Roma… ROU 1996… 1996 Summer Atla…
## # … with 343 more rows, and 4 more variables: sport <chr>, event <chr>,
## # medal <chr>, countryName <chr>
The map above adds depth to our understanding of Olympic medal winning countries. Previous maps show an extreme domination in the Olympics by the United States of America, but taking the wins in relation to total participation, other countries emerge as prolific medal winners as well. Russia, Finland, Germany, and China also tend to win more often when their athletes are competing. The United States still is at the highest level of medal wins. One other aspect to note are all the medals won by countries that no longer exist today, like the USSR and East and West Germany. This graph tells a more accurate story of where champions are from, but cannot wholly depict the wins every achieved in the world.
##Gabby’s Graph
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: ALG, ANZ, BAH, BUL, CHI, CRC, CRO, DEN, EUN, FIJ, FRG, GDR, GER, GRE, GRN, HAI, INA, IOA, IRI, KOS, LAT, MGL, NED, NGR, POR, PUR, RSA, SCG, SLO, SUI, TCH, TPE, UAE, URS, URU, VIE, YUG, ZIM
The map above gives us a basic idea of how many gold medals are earned per country in the olympics. It’s very obvious that the United States outnumber every other country by far with the second and third place countries having over 1000 less.As the numbers of gold medals won decrease, it becomes harder to distinguish the count. While this graph gives us an idea of what countries dominate the summer olympics, we have little details on what events they are winning them in.
Above is a series of 34 graphs, each graph represents a country that has won more than 20 gold medals, and displays what events they have received the gold medals in. These graphs give us a detailed look on the events that are winning the most gold medals for each country that we are looking at. These graphs allow us to compare and contrast the countries since they are all in one place, and easy to distinguish the sports because not only did we color code them, we included the names of them on the y-axis to make it easier. It’s interesting to see which countries have won gold medals in only one sport, and what sport that is. By the same token, these graphs are multipurposeful because they show us how many sports each country has won gold medals in, and the distribution of those medals as well.
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: AHO, ALG, ANG, ANT, ANZ, ARU, ASA, BAH, BAN, BAR, BER, BHU, BIZ, BOH, BOT, BRU, BUL, BUR, CAM, CAY, CGO, CHA, CHI, CRC, CRO, CRT, DEN, ESA, EUN, FIJ, FRG, GAM, GBS, GDR, GEQ, GER, GRE, GRN, GUA, GUI, HAI, HON, INA, IOA, IRI, ISV, IVB, KOS, KSA, KUW, LAT, LBA, LES, LIB, MAD, MAL, MAS, MAW, MGL, MON, MRI, MTN, MYA, NBO, NCA, NED, NEP, NFL, NGR, NIG, OMA, PAR, PHI, PLE, POR, PUR, RHO, ROT, RSA, SAA, SAM, SCG, SEY, SKN, SLO, SOL, SRI, SUD, SUI, TAN, TCH, TGA, TOG, TPE, UAE, UAR, UNK, URS, URU, VAN, VIE, VIN, WIF, YAR, YMD, YUG, ZAM, ZIM
Above are two graphs dealing with a similar concept, looking at earned medals as a whole for the 10 highest winning countries. Previous graphs have only focused on gold medals, while these two graphs bring in all three medals, providing us with a bigger picture. The first graph, the plotly, displays the distribution of medals won by the top 10 countries. Since this graph is proportional we can compare the amount of medals earned by countries by the percentage distribution of each medal. The second graph, gives us similar information but doesn’t show the number of medals as proportions. Here we just see how many of each type of medal each of our top countries have won, in descending order. We can still somewhat estimate the proportion of each medal for each country but here we see which countries have the most medals once again.